package org.mbte.groovypp.examples.wordcount;

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.io.PrintWriter;
import java.io.Writer;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.LinkedHashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Optimization of the original JavaWordCount found here:
 * http://code.google.com/p/groovypptest/source/browse/trunk/WordCount/src/org/mbte/groovypp/examples/wordcount/JavaWordCount.java
 * 1. Collect the counts with a backing hash map ({@link LinkedHashMap}) rather than a tree-based map
 * 2. Word count needs to be case-insensitive (which is actually an optimization)
 * 3. Buffered writers
 *
 * <p>Each run scans the files found one level below {@code ./20_newsgroups},
 * counts every lower-cased word, writes the counts to
 * {@code ./counts-alphabetical-java.txt} and {@code ./counts-decreasing-java.txt},
 * and prints the elapsed time. The whole pass is repeated {@link #RUNS} times.
 */
public class JavaWordCount {
	/** Number of times the complete count-and-report pass is executed and timed. */
	private static final int RUNS = 10;

	/** Matches a single word (regex {@code \w+}); compiled once and shared by all runs. */
	private static final Pattern WORD_PATTERN = Pattern.compile("\\w+");

	/** Orders entries by descending count. */
	private static final Comparator<Map.Entry<String, Integer>> SORT_BY_COUNT = new Comparator<Map.Entry<String, Integer>>() {
		@Override
		public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
			// compareTo rather than subtraction: a difference of ints can overflow.
			return o2.getValue().compareTo(o1.getValue());
		}
	};

	/** Orders entries by ascending word, using {@link String#compareTo(String)}. */
	private static final Comparator<Map.Entry<String, Integer>> SORT_BY_ELEMENT = new Comparator<Map.Entry<String, Integer>>() {
		@Override
		public int compare(Map.Entry<String, Integer> o1, Map.Entry<String, Integer> o2) {
			return o1.getKey().compareTo(o2.getKey());
		}
	};

	/**
	 * Runs the word count {@link #RUNS} times, printing the duration of each run.
	 *
	 * @param args ignored
	 * @throws IOException if the input directory cannot be listed, an input file
	 *         cannot be read, or an output file cannot be written
	 */
	public static void main(String[] args) throws IOException {
		for (int i = 0; i < RUNS; ++i) {
			long timeStart = System.currentTimeMillis();
			File rootDir = new File("./20_newsgroups");
			CountingSet counter = new CountingSet();
			for (File groupDirectory : listFiles(rootDir)) {
				if (groupDirectory.isDirectory() && !groupDirectory.getPath().contains(".svn")) {
					for (File f : listFiles(groupDirectory)) {
						if (f.isFile() && !f.getPath().contains(".svn")) {
							countWords(f, counter);
						}
					}
				}
			}

			writeSorted(counter, SORT_BY_ELEMENT, "./counts-alphabetical-java.txt");
			writeSorted(counter, SORT_BY_COUNT, "./counts-decreasing-java.txt");

			System.out.println("Finished in " + (System.currentTimeMillis() - timeStart) + " ms");
		}
	}

	/**
	 * Lists the children of a directory, failing loudly instead of returning null.
	 *
	 * @param dir directory to list
	 * @return the directory's children, never null
	 * @throws IOException if {@code dir} is not a directory or cannot be read
	 */
	private static File[] listFiles(File dir) throws IOException {
		File[] children = dir.listFiles();
		if (children == null) {
			// File.listFiles() signals "not a directory" and I/O errors with null.
			throw new IOException("Cannot list directory: " + dir.getPath());
		}
		return children;
	}

	/**
	 * Adds every word of one file, lower-cased, to the counter.
	 *
	 * @param file file to read (decoded with the platform default charset, as FileReader does)
	 * @param counter accumulator receiving one {@code add} per word occurrence
	 * @throws IOException if the file cannot be opened or read
	 */
	private static void countWords(File file, CountingSet counter) throws IOException {
		BufferedReader reader = new BufferedReader(new FileReader(file));
		try {
			String line;
			while ((line = reader.readLine()) != null) {
				Matcher matcher = WORD_PATTERN.matcher(line);
				while (matcher.find()) {
					// Locale.ROOT keeps the folding independent of the default locale
					// (e.g. a Turkish locale would map 'I' to a dotless 'i').
					counter.add(matcher.group().toLowerCase(Locale.ROOT));
				}
			}
		} finally {
			// Close even when reading fails, so the file handle is not leaked.
			reader.close();
		}
	}

	/**
	 * Writes the counter's entries, sorted by the given comparator, to a file.
	 *
	 * @param counter word counts to report
	 * @param comp ordering of the output lines
	 * @param path output file; created or overwritten
	 * @throws IOException if the file cannot be opened or written
	 */
	private static void writeSorted(CountingSet counter, Comparator<Map.Entry<String, Integer>> comp, String path) throws IOException {
		// FileWriter (unlike PrintWriter) propagates write failures as IOException.
		Writer writer = new BufferedWriter(new FileWriter(path));
		try {
			sortAndDisplay(counter.entrySet(), comp, writer);
		} finally {
			writer.close();
		}
	}

	/**
	 * Copies the entries into a list, sorts the copy and writes it out.
	 *
	 * @param set entries to report; not modified
	 * @param comp ordering to apply
	 * @param writer destination; left open for the caller to close
	 * @throws IOException if writing fails
	 */
	private static void sortAndDisplay(Set<Map.Entry<String, Integer>> set, Comparator<Map.Entry<String, Integer>> comp, Writer writer) throws IOException {
		List<Map.Entry<String, Integer>> list = new ArrayList<Map.Entry<String, Integer>>(set);
		Collections.sort(list, comp);
		display(list, writer);
	}

	/**
	 * Writes one {@code "word : count"} line per entry, in iteration order.
	 *
	 * @param list entries to write
	 * @param writer destination; left open for the caller to close
	 * @throws IOException if writing fails
	 */
	private static void display(Iterable<java.util.Map.Entry<String, Integer>> list, Writer writer) throws IOException {
		for (Map.Entry<String, Integer> entry : list) {
			writer.write(entry.getKey() + " : " + entry.getValue() + "\n");
		}
	}

	/** Map from word to number of occurrences; iteration follows first-insertion order. */
	private static class CountingSet extends LinkedHashMap<String, Integer> {
		private static final long serialVersionUID = 1L;

		/**
		 * Records one more occurrence of {@code s}.
		 *
		 * @param s the word to count
		 */
		void add(String s) {
			Integer i = get(s);
			put(s, (i == null) ? Integer.valueOf(1) : Integer.valueOf(i + 1));
		}
	}
}